## Stephen Moon
## Statistics Thesis

# Setup ####

library(pscl)
library(tidyverse)
library(rpart)
library(randomForest)
library(arm)
library(glmnet)
library(caret)
library(gridExtra)
library(mgcv)
library(stargazer)
library(schoenberg)

set.seed(5555)

train.prop <- 0.75

setwd("/Users/smoon/Desktop/Thesis/Data/")

source("ThesisHelperFunctions.R")

results <- read.csv("RacesWithDIME.csv", header = T, as.is = T)
results$year.x <- as.factor(results$year.x)
results$partyControl <- as.factor(results$partyControl)
results <- results[-which(results$year.x == 1980),]
N <- nrow(results)

setwd("/Users/smoon/Desktop/Thesis/Plots/")

#fix a weird missing data issue
for(i in 1:nrow(results)) {
  if(is.na(results$lagDemWinMargin[i] & results$year.x[i] == 2012)) {
    if(results$party[i] == "Republican") {
      results$lagDemWinMargin[i] = -1
    } else {
      results$lagDemWinMargin[i] = 1
    }
  }
}

#train-test split
train.idx <- sample(1:N, size = train.prop*N, replace = F)
train <- results[train.idx,]
test <- results[-train.idx,]

#try splitting the models by party
#pres vote share threshold for blue dist
threshold <- 0.45

train.D <- train[train$DemPresVs >= threshold,]
test.D <- test[test$DemPresVs >= threshold,]

train.R <- train[train$DemPresVs <= 1 - threshold,]
test.R <- test[test$DemPresVs <= 1 - threshold,]

# Logistic Regression ####

#full model
cv.mod <- train(form = as.factor(uncontested) ~ 
                  as.factor(year.x) + prcntBA +
                  prcntHS + prcntAsian + prcntWhiteAll + dwnom1 +
                  partyControl + prcntForeignBorn + under10k + over35k +
                  over50k + over75k + over100k + over150k + prcntNotEmploy +
                  gini + prcntOld + lagPresDiff50 +
                  medianIncome + lagUncontested +
                  lagWinMargin, 
                  data = train,
                  family = binomial,
                trControl = trainControl(method = "cv", number = 10),
                method = "glm", na.action = na.omit)
summary(cv.mod)

draw.prob.plot(cv.mod, "ProbabilityPlotFullModel.png", "Full Model")
# print(calculate.accuracy.measure(cv.mod))
# 
# ptm <- proc.time()
# step <- train(form = as.factor(uncontested) ~
#                   as.factor(year.x) + prcntBA +
#                   prcntHS + prcntAsian + prcntWhiteAll + dwnom1 +
#                   partyControl + prcntForeignBorn + under10k + over35k +
#                   over50k + over75k + over100k + over150k + prcntNotEmploy +
#                   gini + prcntOld + lagPresDiff50 +
#                   medianIncome + lagUncontested +
#                   lagWinMargin,
#                 data = train,
#                 family = binomial,
#                 trControl = trainControl(method = "cv", number = 2),
#                 method = "glmStepAIC", na.action = na.omit, trace = FALSE)
# summary(step)
# proc.time() - ptm
# 
# draw.prob.plot(step, "ProbabilityPlotStepwiseModel.png", "Stepwise Model")
# print(calculate.accuracy.measure(step))

#figure out how many missing for each variable
#missing.cols <- colSums(is.na(results))
#missing.cols <- missing.cols[missing.cols != 0]
#write.csv(missing.cols, "MissingCols.csv")

results$predProbUncontested <- predict(cv.mod, newdata = results, 
                                       type = "prob", na.action = na.pass)[,2]
#update train and test sets with these predictions
train <- results[train.idx,]
test <- results[-train.idx,]

#try a random forest model
rf <- randomForest(as.factor(uncontested) ~ year.x + prcntBA +
                    prcntHS + prcntAsian + prcntWhiteAll + dwnom1 +
                    partyControl + prcntForeignBorn + under10k + over35k +
                    over50k + over75k + over100k + over150k + prcntNotEmploy +
                    gini + prcntOld + lagPresDiff50 +
                    medianIncome + lagUncontested +
                    lagWinMargin, data = train, ntree = 500,
                   na.action = na.omit)

draw.prob.plot(rf, "ProbabilityPlotRandomForest.png", "Random Forest")

# Imputation ####

#should only train and test this next set on things that are verifiable (ie contested)
train <- train[train$uncontested == 0,]
test <- test[test$uncontested == 0,]

#try a simple imputation with same variables as the full mod
#and then add in the pred prob uncontested
simplestImputation <- train(form = DemWinMargin ~ as.factor(year.x) + prcntBA +
                              prcntHS + prcntAsian + prcntWhiteAll + dwnom1 +
                              partyControl + prcntForeignBorn + under10k + over35k +
                              over50k + over75k + over100k + over150k + prcntNotEmploy +
                              gini + prcntOld + lagPresDiff50 +
                              medianIncome + lagUncontested +
                              lagWinMargin, 
                            data = train,
                            trControl = trainControl(method = "cv", number = 10),
                            method = "lm", na.action = na.omit)
summary(simplestImputation)

simplestImputation <- train(form = DemWinMargin ~ as.factor(year.x) + prcntBA +
                              prcntHS + prcntAsian + prcntWhiteAll + dwnom1 +
                              prcntForeignBorn + under10k + over35k +
                              over50k + over75k + over100k + over150k + prcntNotEmploy +
                              gini + prcntOld + lagPresDiff50 +
                              medianIncome +
                              lagWinMargin, 
                            data = train,
                            trControl = trainControl(method = "cv", number = 10),
                            method = "lm", na.action = na.omit)
summary(simplestImputation)

simplestImputationLM <- lm(DemWinMargin ~ as.factor(year.x) + prcntBA +
                              prcntHS + prcntAsian + prcntWhiteAll + dwnom1 +
                              prcntForeignBorn + under10k + over35k +
                              over50k + over75k + over100k + over150k + prcntNotEmploy +
                              gini + prcntOld + lagPresDiff50 +
                              medianIncome +
                              lagWinMargin, 
                            data = train,
                            na.action = na.omit)

weightedImputation <- train(form = DemWinMargin ~ as.factor(year.x) + prcntBA +
                              prcntHS + prcntAsian + prcntWhiteAll + dwnom1 +
                              prcntForeignBorn + under10k + over35k +
                              over50k + over75k + over100k + over150k + prcntNotEmploy +
                              gini + prcntOld + lagPresDiff50 +
                              medianIncome +
                              lagWinMargin, 
                            data = train,
                            trControl = trainControl(method = "cv", number = 10),
                            method = "lm", na.action = na.omit,
                            weights = predProbUncontested)
summary(weightedImputation)

weightedImputationLM <- lm(DemWinMargin ~ as.factor(year.x) + prcntBA +
                             prcntHS + prcntAsian + prcntWhiteAll + dwnom1 +
                             prcntForeignBorn + under10k + over35k +
                             over50k + over75k + over100k + over150k + prcntNotEmploy +
                             gini + prcntOld + lagPresDiff50 +
                             medianIncome +
                             lagWinMargin, 
                           data = train,
                           na.action = na.omit,
                           weights = predProbUncontested)

rf.imputation <- randomForest(DemWinMargin ~ predProbUncontested + 
                                year.x + prcntBA +
                                prcntHS + prcntAsian + prcntWhiteAll + dwnom1 +
                                partyControl + prcntForeignBorn + under10k + over35k +
                                over50k + over75k + over100k + over150k + prcntNotEmploy +
                                gini + prcntOld + lagPresDiff50 +
                                medianIncome + lagUncontested +
                                lagWinMargin, data = train, ntree = 500,
                              na.action = na.omit)
importance(rf.imputation)
print(rf.imputation)
predWinMargin.rf <- predict(rf.imputation, newdata = test, na.action = na.pass)
MAE.rf <- mean(abs(predWinMargin.rf - test$DemWinMargin), na.rm = T)

  

#use test set to get avg win margin error
predWinMargin <- predict(simplestImputation, newdata = test, na.action = na.pass)
MSE <- mean((predWinMargin - test$DemWinMargin)^2, na.rm = T)
MAE.unweight <- mean(abs(predWinMargin - test$DemWinMargin), na.rm = T)

#use test set to get avg win margin error for weighted
predWinMarginWeight <- predict(weightedImputation, newdata = test, na.action = na.pass)
MAE.weight <- mean(abs(predWinMarginWeight - test$DemWinMargin), na.rm = T)

#make this prettier
png(filename="WinMarginVsPredProb.png", width = 1000, height = 580)
plot(results$predProbUncontested[results$Rwinner], 
     results$AbsPresVs[results$Rwinner], xlim = c(0, 1),
     ylim = c(0, 1), pch = 18, col = "Red",
     ylab = "Absolute Margin of Victory",
     xlab = "Probability of Uncontestedness",
     main = "Contested Races", cex = 0.45, cex.lab = 1.5, cex.main = 1.5)
points(results$predProbUncontested[results$Rwinner == F], 
       results$AbsPresVs[results$Rwinner == F],
       pch = 1, col = "Blue", cex = 0.35)
abline(simplestImputation)
dev.off()

#start working on prediction of outcome

# Linear Model Assumption Checks ####

#first make relevant cols data frame
#note that lagUncontested not included bc singularities
cols.to.get <- c("year.x", "prcntBA",
                 "prcntHS", "prcntAsian", "prcntWhiteAll", "dwnom1",
                 "partyControl", "prcntForeignBorn", "under10k", "over35k",
                 "over50k", "over75k", "over100k", "over150k", "prcntNotEmploy",
                 "gini", "prcntOld", "lagPresDiff50",
                 "medianIncome", "lagWinMargin", "predProbUncontested")

indices <- c()
for(col in cols.to.get) {
  indices <- c(indices, which(names(train) == col)[1])
}

relevant <- results[,indices]
#true if Dem party control
relevant$partyControl <- as.numeric(relevant$partyControl == "D")

relevant$ID <- seq.int(nrow(relevant))

idcol <- ncol(relevant)

#make linearity plots
continuousCols <- c("prcntBA", "prcntHS", "prcntAsian", "prcntWhiteAll", "dwnom1",
                    "prcntForeignBorn", "under10k", "over35k",
                    "over50k", "over75k", "over100k", "over150k", "prcntNotEmploy",
                    "gini", "prcntOld", "lagPresDiff50",
                    "medianIncome", "lagWinMargin")

originalVariables <- c("abroadPrcnt", "recentArrivalPrcnt", "totalPopBirthPlace", 
               "prcntForeignBorn", "prcntExAliens", "totalHouseholds",
               "under10k", "over10k", "over15k", "over25k", "over35k",
               "over50k", "over75k", "over100k", "over150k", "over200k",
               "meanIncome", "medianIncome", "prcntUnemp","prcntNotEmploy",
               "prcntBA", "prcntHS", "prcntAsian", "prcntBlack", "prcntBlackNotHisp",
               "prcntHisp", "prcntMulti", "prcntWhite", "prcntWhiteAll",
               "prcntNotHisp", "prcntOld", "medianAge", "gini", "dwnom1", "lagPresDiff50",
               "lagWinMargin")
goodNames <- c("abroad pct.", "recent arrival pct.", "total population, birthplace",
               "foreign born pct.", "ex aliens pct.", "total households",
               "under 10k", "over 10k", "over 15k", "over 25k", "over 35k",
               "over 50k", "over 75k", "over 100k", "over 150k", "over 200k",
               "mean income", "median income", "unemployment pct.", "pct. not employed",
               "bachelor\'s degree pct.", "HS degree pct.", "pct. Asian", "pct. Black", "pct. non-Hispanic Black",
               "pct. Hispanic", "pct. multi-racial", "pct. White", "pct. all Whites",
               "pct. non-Hispanic", "pct. old", "median age", "gini", "DW-Nominate 1",
               "lagged Pres. Dem 50", "lagged win margin")

niceColNames <- c()

for(var in continuousCols) {
  i <- which(originalVariables == var)
  niceColNames <- c(niceColNames, goodNames[i])
}

cont.indices <- c()
for(col in continuousCols) {
  cont.indices <- c(cont.indices, which(names(train) == col)[1])
}

len <- length(cont.indices)

png(filename = "LinearityPlots1.png", width = 1000, height = 580)
par(mfrow = c(3, 3))
for(i in 1:(len/2)) {
  obs <- cont.indices[i]
  plot(results[,obs], results$DemWinMargin,
       xlab = niceColNames[i], ylab = "Dem. Win Margin",
       main = paste(niceColNames[i], sep = ""), cex.lab = 1.5,
       cex.axis = 1.5, cex.main = 2)
}
dev.off()

png(filename = "LinearityPlots2.png", width = 1000, height = 580)
par(mfrow = c(3, 3))
for(i in (len/2 + 1):len) {
  obs <- cont.indices[i]
  plot(results[,obs], results$DemWinMargin,
       xlab = niceColNames[i], ylab = "Dem. Win Margin",
       main = paste(niceColNames[i], sep = ""), cex.lab = 1.5,
       cex.axis = 1.5, cex.main = 2)
}
dev.off()

#make residual plot
y.hat <- fitted(simplestImputationLM)
u <- resid(simplestImputationLM)
sigma <- sigma.hat(simplestImputationLM)
par(mfrow = c(1, 1))
png("ResidualPlot.png", width = 1000, height = 400)
residual.plot(y.hat, u, sigma, main = "Residuals vs. Predicted Values", cex.lab = 1.5,
              cex.axis = 1.5, cex.main = 2)
dev.off()

png("QQPlot.png", width = 1000, height = 400)
plot(simplestImputationLM, which = 2, pch = 19, cex.lab = 1.5,
     cex.axis = 1.5, cex.main = 2, sub.caption = "", caption = "",
     main = "Normal Q-Q Plot")
dev.off()

# Uncontested Match ####

#try a basic matching procedure
contested <- relevant[results$uncontested == 0,]
uncontested <- relevant[results$uncontested == 1,]

OG.ID <- uncontested$ID

contested <- as.matrix(contested)
uncontested <- as.matrix(uncontested)

#for now, drop the NAs
######### COME BACK TO THIS
missing.contested <- c()
missing.uncontested <- c()
for(i in 1:nrow(contested)) {
  if(sum(is.na(contested[i,])) > 0) {
    missing.contested <- c(missing.contested, i)
  }
}

for(i in 1:nrow(uncontested)) {
  if(sum(is.na(uncontested[i,])) > 0) {
    missing.uncontested <- c(missing.uncontested, i)
  }
}

contested <- contested[-missing.contested,]
uncontested <- uncontested[-missing.uncontested,]

dims <- dim(contested)
contested <- as.numeric(contested)
dim(contested) <- dims

dims <- dim(uncontested)
uncontested <- as.numeric(uncontested)
dim(uncontested) <- dims

#precision matrix estimate
#using the contested group
S.inv <- solve(cov(uncontested[,-idcol]))

#set up matrix for Mahalanobis distances
m <- nrow(uncontested)
n <- nrow(contested)
d2 <- matrix(nrow = m, ncol = n)

for(i in 1:m) {
  d2[i,] <- mahalanobis(contested[,-idcol], center = uncontested[i,-idcol],
                        cov = S.inv, inverted = TRUE)
}

#do greedy matching with replacement
matches <- numeric(0)

for(i in 1:m) {
  #find minimum value
  matches[i] <- which.min(d2[i,])[1]
}

#get original indices (to correct for the NAs that were dropped)
IDs <- contested[matches, idcol]
imputed <- results$DemWinMargin[IDs]

# Check Matching Accuracy ####

#then check accuracy (see notes)
train.relevant <- train[,indices]
test.relevant <- test[,indices]
#true if Dem party control
train.relevant$partyControl <- as.numeric(train.relevant$partyControl == "D")
test.relevant$partyControl <- as.numeric(test.relevant$partyControl == "D")

train.relevant$ID <- seq.int(nrow(train.relevant))
test.relevant$ID <- seq.int(nrow(test.relevant))

train.con <- train.relevant[train$uncontested == 0,]
test.con <- test.relevant[test$uncontested == 0,]

contested <- as.matrix(train.con)
uncontested <- as.matrix(test.con)

#for now, drop the NAs
######### COME BACK TO THIS
missing.contested <- c()
missing.uncontested <- c()
for(i in 1:nrow(contested)) {
  if(sum(is.na(contested[i,])) > 0) {
    missing.contested <- c(missing.contested, i)
  }
}

for(i in 1:nrow(uncontested)) {
  if(sum(is.na(uncontested[i,])) > 0) {
    missing.uncontested <- c(missing.uncontested, i)
  }
}

contested <- contested[-missing.contested,]
uncontested <- uncontested[-missing.uncontested,]

dims <- dim(contested)
contested <- as.numeric(contested)
dim(contested) <- dims

dims <- dim(uncontested)
uncontested <- as.numeric(uncontested)
dim(uncontested) <- dims

#precision matrix estimate
#using the contested group
S.inv <- solve(cov(uncontested[,-idcol]))

#set up matrix for Mahalanobis distances
m <- nrow(uncontested)
n <- nrow(contested)
d2 <- matrix(nrow = m, ncol = n)

for(i in 1:m) {
  d2[i,] <- mahalanobis(contested[,-idcol], center = uncontested[i,-idcol],
                        cov = S.inv, inverted = TRUE)
}

#do greedy matching with replacement
matches <- numeric(0)
nmatches <- 5

multimatches <- matrix(rep(0, nmatches*m), nrow = m, ncol = nmatches)

for(i in 1:m) {
  #find minimum value
  matches[i] <- which.min(d2[i,])[1]
  
  sorted <- sort(d2[i,], index.return = T)$ix
  
  for(j in 1:nmatches) {
    multimatches[i,j] <- sorted[j]
  }
}

#random baseline
rand.matches <- sample(1:n, size = m, replace = T)
rand.ids <- contested[rand.matches, idcol]
rand.imputed <- train$DemWinMargin[rand.ids]

#get original indices (to correct for the NAs that were dropped)
#shoudl be easy now that there are IDs
originalIDs <- uncontested[,idcol]
IDs <- contested[matches, idcol]
imputed <- train$DemWinMargin[IDs]
#this requires like the original IDs
actuals <- test$DemWinMargin[originalIDs]
MAE.single <- mean(na.omit(abs(imputed - actuals)))
mean(na.omit(abs(rand.imputed - actuals)))
#try MSE

#now do the multimatches stuff
weighted <- imputed
unweighted <- imputed

for(i in 1:m) {
  obs.IDs <- contested[multimatches[i,], idcol]
  weighted[i] <- weighted.mean(x = train$DemWinMargin[obs.IDs],
                               w = train$predProbUncontested[obs.IDs])
  unweighted[i] <- mean(train$DemWinMargin[obs.IDs])
}

MAE.weighted.matching <- mean(na.omit(abs(weighted - actuals)))
MAE.unweighted.matching <- mean(na.omit(abs(unweighted - actuals)))

sum(na.omit(imputed * actuals) < 0)/length(na.omit(imputed * actuals))
sum(na.omit(weighted * actuals) < 0)/length(na.omit(weighted * actuals))
sum(na.omit(unweighted * actuals) < 0)/length(na.omit(unweighted * actuals))


gam1 <- gam(DemWinMargin ~ as.factor(year.x) + s(prcntBA) +
              s(prcntHS) + s(prcntAsian) + s(prcntWhiteAll) + s(dwnom1) +
              partyControl + s(prcntForeignBorn) + s(under10k) + s(over35k) +
              s(over50k) + s(over75k) + s(over100k) + s(over150k) + s(prcntNotEmploy) +
              s(gini) + s(prcntOld) + s(lagPresDiff50) +
              s(medianIncome) +
              s(lagDemWinMargin), 
            data = train, na.action = na.omit)
summary(gam1)

gam2 <- gam(DemWinMargin ~ as.factor(year.x) + s(prcntBA) +
                      s(prcntHS) + s(prcntAsian) + s(prcntWhiteAll) + s(dwnom1) +
                      partyControl + s(prcntForeignBorn) + s(under10k) + s(over35k) +
                      s(over50k) + s(over75k) + s(over100k) + s(over150k) + s(prcntNotEmploy) +
                      s(gini) + s(prcntOld) + s(lagPresDiff50) +
                      s(medianIncome), 
                    data = train, na.action = na.omit)
preds.gam <- predict(gam1, newdata = test)
MAE.gam <- mean(abs(preds.gam - test$DemWinMargin), na.rm = T)
sign <- preds.gam * test$DemWinMargin
sum(sign[!is.na(sign)] < 0)/length(sign[!is.na(sign)])

lm1 <- lm(DemWinMargin ~ as.factor(year.x) + prcntBA +
          prcntHS + prcntAsian + prcntWhiteAll + dwnom1 +
          partyControl + prcntForeignBorn + under10k + over35k +
          over50k + over75k + over100k + over150k + prcntNotEmploy +
          gini + prcntOld + lagPresDiff50 +
          medianIncome + lagUncontested +
          lagDemWinMargin, 
          data = train)
summary(lm1)
preds.lm <- predict(lm1, newdata = test)
mean(abs(preds.lm - test$winMargin), na.rm = T)


# Make MAE Table ####
methods <- c("Linear Model (Unweighted)", "Linear Model (Weighted)", "Random Forest",
             "Hot Deck (Single)", "Hot Deck (Unweighted Avg.)", "Hot Deck (Weighted Avg.)",
             "Generalized Additive Model")
MAEs <- c(MAE.unweight, MAE.weight, MAE.rf, MAE.single, MAE.unweighted.matching,
          MAE.weighted.matching, MAE.gam)
MAE.table <- data.frame("Method" = methods, "MAE" = MAEs)
stargazer(MAE.table, summary = F, rownames = F, digits = 4)

# Make Georgia Predictions ####
districts <- c("2012_GA.3", "2012_GA.8", "2012_GA.10")
IDs.GA <- 1:3
for(i in 1:length(districts)) {
  IDs.GA[i] <- which(results$distYear == districts[i])
}
GA <- results[IDs.GA,,drop = F]
GA$distYear

#same method order as above except the first one is now actual
methods <- c("Actual Results", methods)
predVoteShares.3 <- 1:8
predVoteShares.8 <- 1:8
predVoteShares.10 <- 1:8

#enter the actual results
predVoteShares.3[1] <- GA$DemWinMargin[1]
predVoteShares.8[1] <- GA$DemWinMargin[2]
predVoteShares.10[1] <- GA$DemWinMargin[3]

#unweighted linear model
ULM.GA <- predict(simplestImputationLM, newdata = GA, na.action = na.pass, 
                  interval = "confidence")

#weighted LM
WLM.GA <- predict(weightedImputationLM, newdata = GA, na.action = na.pass,
                  interval = "confidence")

#Random Forest
RF.GA <- predict(rf.imputation, newdata = GA, na.action = na.pass,
                 interval = "confidence")

#Hot deck single
single1 <- imputed[which(OG.ID == IDs.GA[1])]
single2 <- imputed[which(OG.ID == IDs.GA[2])]
single3 <- imputed[which(OG.ID == IDs.GA[3])]
HD.single <- c(single1, single2, single3)

#Hot deck unweighted avg
UW1 <- unweighted[which(OG.ID == IDs.GA[1])]
UW2 <- unweighted[which(OG.ID == IDs.GA[2])]
UW3 <- unweighted[which(OG.ID == IDs.GA[3])]
HD.UW <- c(UW1, UW2, UW3)

#Hot deck weighted avg
W1 <- weighted[which(OG.ID == IDs.GA[1])]
W2 <- weighted[which(OG.ID == IDs.GA[2])]
W3 <- weighted[which(OG.ID == IDs.GA[3])]
HD.W <- c(W1, W2, W3)

#GAM
GAM.GA <- predict(gam1, newdata = GA, na.action = na.pass, type = "link",
                  se.fit = TRUE)
SE <- GAM.GA$se.fit
fit <- GAM.GA$fit
upr <- fit + 1.96*SE
lwr <- fit - 1.96*SE

upr <- gam1$family$linkinv(upr)
lwr <- gam1$family$linkinv(lwr)

GAM.GA <- cbind(fit, lwr, upr)

#fill vectors
predVoteShares.3 <- c(-1, ULM.GA[1], WLM.GA[1], RF.GA[1],
                      HD.single[1], HD.UW[1], HD.W[1], GAM.GA[1])
predVoteShares.8 <- c(-1, ULM.GA[2], WLM.GA[2], RF.GA[2],
                      HD.single[2], HD.UW[2], HD.W[2], GAM.GA[2])
predVoteShares.10 <- c(-1, ULM.GA[2], WLM.GA[2], RF.GA[2],
                      HD.single[2], HD.UW[2], HD.W[2], GAM.GA[2])

#convert to statewide counts
statewide.GA <- data.frame("Method" = methods, "Dem. Vote Share" = NA,
                           "Rep. Vote Share" = NA, "Dem. Win Margin" = NA,
                           "Dem. Win Margin Confidence Interval" = NA)
get.district.counts <- function(nvoters, share) {
  demcount <- nvoters * (0.5 + 0.5*share)
  repcount <- nvoters * (0.5 - 0.5*share)
  return(c(demcount, repcount))
}

get.district.shares <- function(share) {
  demshare <- (0.5 + 0.5*share)
  repshare <- (0.5 - 0.5*share)
  return(c(demshare, repshare))
}

avg.contested <- 264759
reptotal.contested <- 157181 + 92410 + 75041 + 43335 + 189669 + 156689 + 192101 +
                      196968 + 119973 + 79550 + 159947
demtotal.contested <- 92399 + 162751 + 208861 + 234330 + 104365 + 95377 + 60052 +
                      90353 + 139148 + 201988 + 59245

reptotal.uncontested <- 232380 + 197789 + 211065
demtotal.uncontested <- 0

#make results table
distList <- 1:14
distList <- c(distList, "Total")
repubGA <- c(157181, 92410, 232380, 75041, 43335, 189669, 156689, 197789, 192101,
               211065, 196968, 119973, 79550, 159947)
repubGA <- c(repubGA, sum(repubGA))
democratGA <- c(92399, 162751, 0, 208861, 234330, 104365, 95377, 0, 60052, 0,
                  90353, 139148, 201988, 59245)
democratGA <- c(democratGA, sum(democratGA))
demShareTable <- democratGA/(repubGA + democratGA)
repShareTable <- repubGA/(repubGA + democratGA)
demWinMarginTable <- demShareTable - repShareTable

GA.chap1.table <- data.frame(District = distList, DemVotes = democratGA,
                             RepVotes = repubGA,
                             DemShare = round(demShareTable, digits = 2),
                             RepShare = round(repShareTable, digits = 2),
                             DWinMargin = round(demWinMarginTable, digits = 2))
stargazer(GA.chap1.table, summary = F, rownames = F, digits = 2)

contested.DemWinMargin <- (demtotal.contested - reptotal.contested)/(demtotal.contested +
                                                                       reptotal.uncontested +
                                                                       reptotal.contested)
uncontested.DemWinMargin <- ((demtotal.contested + demtotal.uncontested) - 
                            (reptotal.contested + reptotal.uncontested))/
                            (demtotal.contested + demtotal.uncontested +
                             reptotal.contested + reptotal.uncontested)

get.state.total <- function(share3, share8, share10) {
  counts3 <- get.district.counts(avg.contested, share3)
  counts8 <- get.district.counts(avg.contested, share8)
  counts10 <- get.district.counts(avg.contested, share10)
  demtotal <- demtotal.contested + counts3[1] + counts8[1] + counts10[1]
  reptotal <- reptotal.contested + counts3[2] + counts8[2] + counts10[2]
  return(c(demtotal, reptotal))
}

get.state.share <- function(share3, share8, share10) {
  shares3 <- get.district.shares(share3)
  shares8 <- get.district.shares(share8)
  shares10 <- get.district.shares(share10)
  demtotal <- (10*(0.5 + contested.DemWinMargin) + shares3[1] + shares8[1] + shares10[1])/13
  reptotal <- (10*(0.5 - contested.DemWinMargin) + shares3[2] + shares8[2] + shares10[2])/13
  return(c(demtotal, reptotal))
}

statewide.GA$Dem..Vote.Share[1] <- 0.5 + 0.5*uncontested.DemWinMargin
statewide.GA$Rep..Vote.Share[1] <- 0.5 - 0.5*uncontested.DemWinMargin
statewide.GA$Dem..Win.Margin[1] <- uncontested.DemWinMargin

for(i in 2:length(methods)) {
  GA.est <- get.state.share(predVoteShares.3[i], predVoteShares.8[i], predVoteShares.10[i])
  statewide.GA$Dem..Vote.Share[i] <- GA.est[1]
  statewide.GA$Rep..Vote.Share[i] <- GA.est[2]
  statewide.GA$Dem..Win.Margin[i] <- 2*(GA.est[1] - 0.5)
}

#do the CIs
get.CI <- function(mydata, center) {
  lwr <- mydata[,2]
  upr <- mydata[,3]
  radius <- sqrt(sum((upr - lwr)^2))
  upp <- center + radius
  low <- center - radius
  return(paste("[",format(low, digits = 2),", ",
               format(upp, digits = 2),"]", sep = ""))
}

statewide.GA$Dem..Win.Margin.Confidence.Interval[2] <- get.CI(ULM.GA, statewide.GA$Dem..Win.Margin[2])
statewide.GA$Dem..Win.Margin.Confidence.Interval[3] <- get.CI(WLM.GA, statewide.GA$Dem..Win.Margin[3])
statewide.GA$Dem..Win.Margin.Confidence.Interval[8] <- get.CI(GAM.GA, statewide.GA$Dem..Win.Margin[8])

stargazer(statewide.GA, summary = F, rownames = F, digits = 2)

# National Predictions ####
#ensure no duplicates
all2012 <- results[results$year.x == 2012,]
dists <- c()
dropIDs <- c()
for(i in 1:nrow(all2012)) {
  if(all2012$distYear[i] %in% dists) {
    dropIDs <- c(dropIDs, i)
  } else {
    dists <- c(dists, all2012$distYear[i])
  }
}
all2012 <- all2012[-dropIDs,]

all2012$imputed <- predict(gam1, newdata = all2012, na.action = na.pass,
                           type = "response")
states.with.uncontested <- unique(all2012$state.x[all2012$uncontested == 1])

national.table <- data.frame("State" = NA, "Raw D Share" = NA, "Raw R Share" = NA, 
                             "Raw D Win Margin" = NA,
                             "Imputed D Share" = NA, "Imputed R Share" = NA,
                             "Imputed D Win Margin"= NA)

get.state.row <- function(stateName) {
  state <- all2012[all2012$state.x == stateName,]
  rawDTotal <- sum(state$dVotes[!is.na(state$dVotes)])
  rawRTotal <- sum(state$rVotes[!is.na(state$rVotes)])
  rawDShare <- rawDTotal/(rawDTotal + rawRTotal)
  rawRShare <- rawRTotal/(rawDTotal + rawRTotal)
  rawDWinMargin <- rawDShare - rawRShare
  
  contested.state <- state[state$uncontested == 0,]
  uncontested.state <- state[state$uncontested == 1,]
  weight.ratio <- nrow(contested.state)/(nrow(state))
  dvotes <- sum(contested.state$dVotes[!is.na(contested.state$dVotes)])
  rvotes <- sum(contested.state$rVotes[!is.na(contested.state$rVotes)])
  contested.state.margin <- (dvotes - rvotes)/(dvotes + rvotes)
  
  if(stateName == "North Dakota") {
    contested.state.margin <- 0
  }
  
  predMargin <- weight.ratio * contested.state.margin + 
                  (1 - weight.ratio) * mean(uncontested.state$imputed)
  predDShare <- 0.5 + 0.5 * predMargin
  predRShare <- 0.5 - 0.5 * predMargin
  
  return(round(c(rawDShare, rawRShare, rawDWinMargin, predDShare, predRShare, predMargin),
               digits = 2))
}

for(state in states.with.uncontested) {
  row <- get.state.row(state)
  row <- c(state, row)
  national.table <- rbind(national.table, setNames(row, names(national.table)))
}

#clean up table
national.table <- national.table[-1,]

#now compute national vote share
rawDTotal <- sum(all2012$dVotes[!is.na(all2012$dVotes)])
rawRTotal <- sum(all2012$rVotes[!is.na(all2012$rVotes)])
rawDShare <- rawDTotal/(rawDTotal + rawRTotal)
rawRShare <- rawRTotal/(rawDTotal + rawRTotal)
rawDWinMargin <- rawDShare - rawRShare

contested.nat <- all2012[all2012$uncontested == 0,]
uncontested.nat <- all2012[all2012$uncontested == 1,]
weight.ratio <- nrow(contested.nat)/(nrow(all2012))
dvotes <- sum(contested.nat$dVotes[!is.na(contested.nat$dVotes)])
rvotes <- sum(contested.nat$rVotes[!is.na(contested.nat$rVotes)])
contested.nat.margin <- (dvotes - rvotes)/(dvotes + rvotes)

predMargin <- weight.ratio * contested.nat.margin + 
  (1 - weight.ratio) * mean(uncontested.nat$imputed)
predDShare <- 0.5 + 0.5 * predMargin
predRShare <- 0.5 - 0.5 * predMargin

row <- round(c(rawDShare, rawRShare, rawDWinMargin, predDShare, predRShare, predMargin),
             digits = 3)
row <- c("National", row)
national.table <- rbind(national.table, setNames(row, names(national.table)))

stargazer(national.table, summary = F, row.names = F, digits = 3)

# Model Output for Appendix ####
stargazer(simplestImputation$finalModel, single.row = T, no.space = T,
          title = "Unweighted Linear Model")
stargazer(weightedImputation$finalModel, single.row = T, no.space = T,
          title = "Weighted Linear Model")

stargazer(gam1, single.row = T, no.space = T,
          title = "Generalized Additive Model, Main Terms")
stargazer(summary(gam1)$s.table, single.row = T, no.space = T,
          title = "Generalized Additive Model, Smooth Terms")
